# Packages this script depends on: readxl for read_excel(), dplyr for
# mutate()/if_else()/count()/%>%, and nortest for lillie.test().
required_packages <- c("readxl","dplyr", "nortest")
# Install each package that is not available yet, then attach it.
# character.only = TRUE makes library() read the package name from the string
# stored in `package` instead of looking for a package literally called
# "package".
for (package in required_packages) {
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package)
  }
  library(package, character.only = TRUE)
}

# Import the cohort data frame unless one is already loaded in this session.
# A bare exists("df") cannot serve as the guard: it is always TRUE because
# stats::df() (the F-distribution density) is on the search path, so the
# workbook would never be read in a fresh session. Restricting the lookup to
# mode "list" skips that function; is.data.frame() then confirms the type
# (get0() returns NULL when nothing suitable is found).
if (!is.data.frame(get0("df", mode = "list"))) {
  file_path <- "~/cohort.xlsx"
  # Cells holding the Excel error text "#N/A" are read as missing values.
  df <- read_excel(file_path, sheet = "cohort", na = "#N/A")
}

# Filter df: keep only rows whose pet_distance_ic lies in [-1, 365].
# NOTE(review): rows with a missing pet_distance_ic are dropped here as well
# (subset() discards rows whose condition is not TRUE) -- confirm that
# excluding them is intended.
df_filtered <- subset(
  df,
  !is.na(pet_distance_ic) & pet_distance_ic >= -1 & pet_distance_ic <= 365
)

# Text for paper: print the cohort sentences with the current numbers filled
# in.
#  * nrow() is used for the cohort sizes; dplyr::count() returns a one-cell
#    data frame rather than a plain number.
#  * na.rm = TRUE keeps a single missing flag from turning a reported count
#    into NA.
#  * The scanner sentence reports "x / n" with both numbers taken from the
#    filtered cohort; the denominator previously came from the unfiltered df
#    while the numerator came from df_filtered.
n_total <- nrow(df)
n_included <- nrow(df_filtered)

print(paste("A total of ", n_total, "underwent immune-checkpoint inhibitor therapy..."))
print(paste("with chemotherapy (n=", sum(df_filtered$chemo == 1, na.rm = TRUE), ")"))
print(paste(
  "and/or radiation therapy (n=",
  sum(df_filtered$pet_after_radia == 1, na.rm = TRUE),
  ")"
))
print(paste(n_total - n_included, "patients were excluded ..."))
print(paste("The remaining ", n_included, "patients are displayed in Table 1"))
print(paste(
  "PET/CT data from ",
  length(unique(df_filtered$pet_location)),
  "different locations were used for this analysis"
))
print(paste(
  "In ",
  sum(df_filtered$pet_device == 6, na.rm = TRUE),
  "/",
  n_included,
  "patients the pretherapeutic FDG-PET/CT was acquired using the same imaging PET/CT scanner"
))

# Categorize values: derive 0/1 indicator columns for the summary table.

# Rank the scanner codes by how often they occur. The table is stored as
# `device_counts` (not `count`) so that it does not shadow dplyr::count().
device_counts <- table(df_filtered$pet_device)
sorted <- sort(device_counts, decreasing = TRUE)
# head() simply returns fewer names when fewer than five scanners occur;
# indexing with [1:5] would pad the vector with NA in that case.
top5 <- head(names(sorted), 5)

# Scanner codes that receive their own indicator column below. "otherDevices"
# is defined relative to the computed top five, so warn when the two sets
# disagree: the scanner columns would then overlap or leave a gap.
named_devices <- c(1, 2, 3, 6, 8)
if (!setequal(top5, as.character(named_devices))) {
  warning(
    "Most frequent pet_device codes (", paste(top5, collapse = ", "),
    ") differ from the codes with dedicated columns (",
    paste(named_devices, collapse = ", "),
    "); 'otherDevices' may not be their complement.",
    call. = FALSE
  )
}

# Each indicator is 1 when the condition holds, 0 when it does not, and NA
# when the source value is missing (otherDevices is never NA because %in%
# does not return NA).
df_filtered <- df_filtered %>%
  mutate(
    staging_I = if_else(staging == 1, 1, 0),
    staging_II = if_else(staging == 2, 1, 0),
    staging_III = if_else(staging == 3, 1, 0),
    staging_IV = if_else(staging == 4, 1, 0),
    male = if_else(sex_male == 1, 1, 0),
    female = if_else(sex_male == 0, 1, 0),
    SiemensBiopraph20 = if_else(pet_device == 1, 1, 0),
    SiemensBiopraph40 = if_else(pet_device == 2, 1, 0),
    SiemensBiopraph64 = if_else(pet_device == 3, 1, 0),
    GEDiscovery690 = if_else(pet_device == 6, 1, 0),
    PhilipsGuardianBody = if_else(pet_device == 8, 1, 0),
    otherDevices = if_else(!pet_device %in% top5, 1, 0),
    pet_before_immunetherapy = if_else(pet_distance_ic >= 0, 1, 0)
  )

# Define variables: the columns of df_filtered that are summarised, in the
# order in which they appear in the output table. The order matters because
# the header rows are inserted later at fixed row positions.
variables <- c(
  c("age", "bmi", "packyears", "blood_hb"),
  c("female", "male"),
  c("staging_I", "staging_II", "staging_III", "staging_IV"),
  c(
    "histo_nsclc_adeno", "histo_nsclc_squamous",
    "histo_sclc_neuroendocrine", "histo_others"
  ),
  c("pet_before_radia", "op_before_pet"),
  c("nicotine", "nicotine_female", "nicotine_male"),
  c("copd", "pericardial_effusion", "pleural_effusion", "diabetes", "chd"),
  c(
    "GEDiscovery690", "SiemensBiopraph20", "SiemensBiopraph40",
    "SiemensBiopraph64", "PhilipsGuardianBody", "otherDevices"
  )
)

# Define results df: one row per entry of `variables`, columns `variable`,
# `mean` and `sd`, all stored as character. What `mean` / `sd` hold depends on
# the column being summarised:
#   * no non-missing values -> "NA" / "NA"
#   * only 0/1 values       -> number of ones / percentage of ones
#   * anything else         -> mean / SD when the data are compatible with a
#                              normal distribution, otherwise median / IQR
results <- data.frame(
  variable = variables,
  mean = rep(NA_character_, length(variables)),
  sd = rep(NA_character_, length(variables)),
  stringsAsFactors = FALSE
)

# Significance level of the Lilliefors normality test.
normality_alpha <- 0.01

# Build results with variables
for (i in seq_along(variables)) {
  var_name <- variables[i]
  var_values <- df_filtered[[var_name]]
  observed <- var_values[!is.na(var_values)]
  var_count <- length(observed)

  if (var_count == 0) {
    # Checked first: an all-missing column would otherwise satisfy the 0/1
    # test vacuously and be reported as 0 ones with a NaN percentage.
    row_values <- c("NA", "NA")
  } else if (all(observed %in% c(0, 1))) {
    var_count_ones <- sum(observed == 1)
    # The sex-specific nicotine rows are expressed relative to the number of
    # female / male patients instead of the number of non-missing values.
    denominator <- switch(
      var_name,
      nicotine_female = sum(df_filtered$female == 1, na.rm = TRUE),
      nicotine_male = sum(df_filtered$male == 1, na.rm = TRUE),
      var_count
    )
    var_percent <- round(var_count_ones / denominator * 100, 1)
    row_values <- c(var_count_ones, var_percent)
  } else {
    # lillie.test() stops for fewer than 5 observations and cannot handle a
    # sample without spread; such columns fall back to median / IQR.
    can_test <- var_count > 4 && sd(observed) > 0
    # The null hypothesis of the Lilliefors test is normality, so the data
    # count as normally distributed only when the test does NOT reject it
    # (p >= alpha). A small p-value means "not normal".
    is_normal <- can_test &&
      lillie.test(observed)$p.value >= normality_alpha

    if (is_normal) {
      # normal distribution: mean and standard deviation
      row_values <- c(round(mean(observed), 2), round(sd(observed), 2))
    } else {
      # not normal distribution: median and interquartile range
      row_values <- c(round(median(observed), 2), round(IQR(observed), 2))
    }
  }

  results$mean[i] <- as.character(row_values[1])
  results$sd[i] <- as.character(row_values[2])
}

# Function for new rows in results df.
#
# Inserts one header row into the global `results` data frame directly after
# row number `row` (row = 0 puts it at the very top, row = nrow(results)
# appends it at the end).
#
# variables: text for the `variable` column. Two values are special:
#   "Variable" -> the row also gets "No." / "%" in the mean / sd columns;
#   "start"    -> the row becomes "Variable" / "Mean / Median" /
#                 "+/- SD / IQR".
#   Any other text yields empty mean / sd cells.
# row: number of existing rows that stay above the new row; must lie
#   between 0 and nrow(results).
#
# Side effect: replaces the global `results` via `<<-`, which is what the
# calls further down in this script rely on. The updated data frame is also
# returned invisibly.
insert_row <- function(variables, row) {
  n_rows <- nrow(results)
  stopifnot(
    length(row) == 1, !is.na(row), row >= 0, row <= n_rows
  )

  mean <- ""
  sd <- ""
  if (variables == "Variable") {
    mean <- "No."
    sd <- "%"
  }
  if (variables == "start") {
    variables <- "Variable"
    mean <- "Mean / Median"
    sd <- "+/- SD / IQR"
  }
  new_row <- data.frame(
    variable = variables, mean, sd,
    stringsAsFactors = FALSE
  )

  # seq_len() yields an empty index when there is nothing above or below the
  # insertion point. `(row + 1):n_rows` would count backwards for
  # row == n_rows and append an NA row plus a copy of the last row.
  rows_above <- results[seq_len(row), ]
  rows_below <- results[row + seq_len(n_rows - row), ]
  results <<- rbind(rows_above, new_row, rows_below)
}

# Rename variables in df: display labels keyed by the internal variable name.
# Every cell of `results` (in any column) that exactly equals one of the keys
# is replaced by the corresponding label; all other cells are left untouched.
display_labels <- c(
  age = "Age [y]",
  bmi = "BMI [kg/m²]",
  packyears = "Pack years [y]",
  blood_hb = "Hb [g/dL]",
  female = "Female",
  male = "Male",
  staging_I = "I",
  staging_II = "II",
  staging_III = "III",
  staging_IV = "IV",
  histo_nsclc_adeno = "NSCLC adenocarcinoma",
  histo_nsclc_squamous = "NSCLC squamous cell carcinoma",
  histo_sclc_neuroendocrine = "SCLC neuroendocrine carcinoma",
  histo_others = "Other lung cancer histology",
  pet_before_radia = "Thorax radiation before FDG-PET/CT",
  op_before_pet = "Lung operation",
  nicotine = "Nicotine consumption",
  nicotine_female = "in female patients",
  nicotine_male = "in male patients",
  copd = "COPD",
  pericardial_effusion = "Pericardial effusion",
  pleural_effusion = "Pleural effusion",
  diabetes = "Diabetes mellitus type II",
  chd = "Coronary heart disease",
  SiemensBiopraph20 = "Siemens Biograph 20",
  SiemensBiopraph40 = "Siemens Biograph 40",
  SiemensBiopraph64 = "Siemens Biograph 64",
  GEDiscovery690 = "GE Discovery 690",
  PhilipsGuardianBody = "Philips Guardian Body",
  otherDevices = "Other scanners"
)

# `results[] <-` keeps `results` a data frame while swapping in the relabelled
# columns.
results[] <- lapply(results, function(column) {
  label_index <- match(column, names(display_labels))
  has_label <- !is.na(label_index)
  column[has_label] <- unname(display_labels[label_index[has_label]])
  column
})

# Add rows in results df with function: header text and the row number after
# which each header is inserted. The insertions run strictly in this order,
# so every position refers to the table as it looks after all earlier
# insertions.
header_labels <- c(
  "Physical characteristics", "start", "Smoking history", "Blood count",
  "Variable", "Physical characteristics", "Biological sex", "Tumor",
  "Staging", "Histology", "Clinical history", "Smoking history",
  "Comorbidities", "PET/CT Scanner"
)
header_positions <- c(0, 0, 4, 6, 8, 9, 10, 13, 14, 19, 24, 27, 31, 37)

for (k in seq_along(header_labels)) {
  insert_row(header_labels[k], header_positions[k])
}

# Save: show the finished table on the console, then write it without row
# names to "Table 1.csv" in the current working directory and confirm.
print(results)
write.csv(x = results, file = "Table 1.csv", row.names = FALSE)
print("saved: Table 1.csv")
